#Objective: To analyze the differential expression between individuals who develop MPNST compared to individuals who don’t (pNF)

### load files

load("../data/salmon.result.metadata.Rda")
syn.query <- salmon.result[salmon.result$specimenID != 
    "2-025 Neurofibroma", ]
metadata.df <- syn.query[, colSums(is.na(syn.query)) < 
    nrow(syn.query)]
cols.keep <- c("specimenID", "id", "individualID", 
    "sex", "tumorType", "isCellLine", "isPrimaryCell", 
    "tissue", "experimentalCondition", "consortium", 
    "study", "transplantationType")
metadata.df <- metadata.df[, cols.keep]

rownames(metadata.df) <- metadata.df$specimenID

filter for protein coding genes

From the transcript counts step to make sure no unprocessed transcripts, etc are included in the counts. Also only includes genes with gene symbols.

load("../data/prot_coding_counts_expData.Rda")

data.mat = reshape2::acast(expData, Symbol ~ specimenID, 
    value.var = "roundedCounts")
missing = which(apply(data.mat, 1, function(x) any(is.na(x))))
if (length(missing) > 0) data.mat = data.mat[-missing, 
    ]

### manually change name of 2-004 specID
colnames(data.mat)[8] <- "2-004 Plexiform Neurofibroma"

### reorder data.mat
data.mat <- data.mat[, rownames(metadata.df)]
### filter for pNFs that progress/dont to MPNST

# have pNF, have MPNST
pNF_progress <- (metadata.df %>% filter(individualID %in% 
    c("2-002", "2-003", "2-009", "2-013", "2-015", 
        "2-016", "2-023", "2-031") & tumorType == "Plexiform Neurofibroma"))$specimenID

# have pNF, don't have MPNST
pNF_noProgress <- (metadata.df %>% filter(!(individualID %in% 
    c("2-002", "2-003", "2-009", "2-013", "2-015", 
        "2-016", "2-023", "2-031")) & tumorType == 
    "Plexiform Neurofibroma"))$specimenID

### just pNF MPNST
metadata.df_filtered <- metadata.df %>% filter(tumorType == 
    "Plexiform Neurofibroma")

### add progress to MPNST col
metadata.df_filtered$MPNST_progression <- "progress"
metadata.df_filtered[which(metadata.df_filtered$specimenID %in% 
    pNF_progress), ]$MPNST_progression <- "progress"

metadata.df_filtered[which(metadata.df_filtered$specimenID %in% 
    pNF_noProgress), ]$MPNST_progression <- "no_progress"

### shorten words and remove spaces for downstream
metadata.df_filtered$tumorType <- gsub("Plexiform Neurofibroma", 
    "pNF", metadata.df_filtered$tumorType)

filtered_ids <- metadata.df_filtered$specimenID

data.mat <- data.mat[, filtered_ids]

metadata.df_filtered$specimenID <- gsub(" Plexiform Neurofibroma", 
    "pNF", metadata.df_filtered$specimenID)

### change colname in txi to match
colnames(data.mat) <- metadata.df_filtered$specimenID

rownames(metadata.df_filtered) <- metadata.df_filtered$specimenID

run DESEQ

filter out counts < 10 and set alpha to .05

# detach('package:synapser', unload=TRUE)
# unloadNamespace('PythonEmbedInR')
dds <- DESeqDataSetFromMatrix(countData = data.mat, 
    colData = metadata.df_filtered, design = ~MPNST_progression)

### filter out reads with low counts
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep, ]

dds <- DESeq(dds)

res <- results(dds, alpha = 0.05)

resultsNames(dds)
## [1] "Intercept"                                
## [2] "MPNST_progression_progress_vs_no_progress"
resLFC <- lfcShrink(dds, coef = 2, type = "apeglm")  # plot looks weird with this type
# resLFC <- lfcShrink(dds, coef = 3, type='normal')
res
## log2 fold change (MLE): MPNST progression progress vs no progress 
## Wald test p-value: MPNST progression progress vs no progress 
## DataFrame with 17134 rows and 6 columns
##                 baseMean     log2FoldChange             lfcSE
##                <numeric>          <numeric>         <numeric>
## A2M     192620.141165638    1.2455849772124  1.37304432686945
## A2ML1   99.0484277047366  -1.47466203827885  1.17525586293471
## A3GALT2 7.78461394662319  0.258387312214684  1.53382824232674
## A4GALT  1721.88443067838 -0.400371382201492 0.516065131970599
## A4GNT   1.30935703259983  -1.01842624308863  2.06407610591747
## ...                  ...                ...               ...
## ZXDC    1140.88565560739   0.13446750582303 0.239422567321793
## ZYG11A  77.8995768418035  -3.78847839867316    1.392185152915
## ZYG11B  3655.22169560955 -0.150338630687026  0.27809965996628
## ZYX     8771.76995776608  0.423025104653367 0.460148987732322
## ZZEF1   2964.62897420378  0.359348058194729 0.435090664921054
##                       stat              pvalue              padj
##                  <numeric>           <numeric>         <numeric>
## A2M      0.907170258699759   0.364316768757964 0.762692129227961
## A2ML1    -1.25475829118308   0.209566517838882 0.659690701578338
## A3GALT2  0.168459091496922   0.866222123952563 0.973165064688762
## A4GALT  -0.775815604268148   0.437857869132013 0.808646094365473
## A4GNT   -0.493405374040675   0.621726185920339 0.900126265073314
## ...                    ...                 ...               ...
## ZXDC     0.561632545031983   0.574366401837014 0.879062425858905
## ZYG11A   -2.72124608622692 0.00650363266360434 0.307430821223801
## ZYG11B  -0.540592644756398   0.588788388686632 0.884581657410083
## ZYX      0.919322036843095    0.35792715468426 0.758745150347109
## ZZEF1     0.82591534860885    0.40885211322699 0.790269538420699
summary(res)
## 
## out of 17134 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 33, 0.19%
## LFC < 0 (down)     : 45, 0.26%
## outliers [1]       : 518, 3%
## low counts [2]     : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

The LFC is of the MPNST progress / no progress

# resLFC

check outliers

par(mar = c(8, 5, 2, 2))
boxplot(log10(assays(dds)[["cooks"]]), range = 0, las = 2)

check dispersion

Dispersion is a metric for how much the variance differs from the mean in a negative binomial distribution.

plotDispEsts(dds)

MA plots

plotMA(res)

logFC shrink

plotMA(resLFC)  #, ylim=c(-2,2))

res_df = as.data.frame(dplyr::mutate(as.data.frame(res), 
    sig = ifelse(res$padj < 0.05, "FDR<0.05", "Not Sig")), 
    row.names = rownames(res))

mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
# mart <- useMart(biomart = 'ENSEMBL_MART_ENSEMBL',
# dataset = 'hsapiens_gene_ensembl', host =
# 'uswest.ensembl.org')
write.csv(res_df, file = "pNF_v_MPNST_geneList.csv", 
    row.names = TRUE)

heatmap of standardized vsd count matrix

normalized by variance stabilizing transformation and displaying the top 50 highest expressed genes

vsd <- vst(dds, blind = FALSE)
# rename gene
vsd_matrix <- SummarizedExperiment::assay(vsd)

select <- order(rowMeans(vsd_matrix), decreasing = TRUE)[1:50]

top_genes <- vsd_matrix[select, ]

top_genes = top_genes - rowMeans(top_genes)  # Subtract the row means from each value to standardize

df <- as.data.frame(colData(dds)[, c("sex", "MPNST_progression")])

pheatmap(top_genes, cluster_rows = TRUE, show_rownames = TRUE, 
    cluster_cols = TRUE, annotation_col = df, color = rev(brewer.pal(11, 
        "Spectral")))

Just by highest expressed genes they don’t cluster as expected, there is a pNF with no MPST progression that doesn’t cluster with other no MPST progression individuals

heatmap of vsd top 50 padj standardized

rld <- rlog(dds, blind = F)
# exp_matrix <- SummarizedExperiment::assay(rld)
# ### top 100 select <- order(rowMeans(exp_matrix),
# decreasing=TRUE)[1:20] top100 <-
# exp_matrix[select,] annotation_data <-
# as.data.frame(colData(rld)['MPNST_progression'] )
# pheatmap(exp_matrix,
# annotation_col=annotation_data, color =
# rev(brewer.pal(11, 'Spectral')))


mat <- assay(vsd)
mat_df <- as.data.frame(mat)

mat_prot_df <- mat_df[head(order(res[rownames(res), 
    ]$padj), 50), ]  # select the top 50 genes with the lowest padj
mat_prot_df = mat_prot_df - rowMeans(mat_prot_df)  # Subtract the row means from each value to standardize

## remove gene name rows
mat_prot_matrix <- as.matrix(mat_prot_df)

df = as.data.frame(colData(rld)[, c("sex", "MPNST_progression")])  # Create a dataframe with a column of the conditions
rownames(df) = colnames(mat)  # add rownames
# and plot the actual heatmap
pheatmap(mat_prot_matrix, annotation_col = df, color = rev(brewer.pal(11, 
    "Spectral")))

heatmap of samp to samp distance

sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$specimenID, 
    vsd$MPNST_progression, sep = "-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMatrix, clustering_distance_rows = sampleDists, 
    clustering_distance_cols = sampleDists, col = colors, 
    show_rownames = TRUE)

They don’t cluster by progression type

pca by sample and sex

p <- plotPCA(vsd, intgroup = c("individualID", "MPNST_progression", 
    "sex"), returnData = TRUE)
percentVar <- round(100 * attr(p, "percentVar"))
ggplot(p, aes(PC1, PC2, color = individualID, shape = sex, 
    label = vsd$MPNST_progression)) + geom_point(size = 3) + 
    xlab(paste0("PC1: ", percentVar[1], "% variance")) + 
    ylab(paste0("PC2: ", percentVar[2], "% variance")) + 
    coord_fixed() + ggrepel::geom_text_repel()

volcano plot

### https://www.biostars.org/p/282295/
par(mar = c(5, 5, 5, 5), cex = 1, cex.main = 1.4, cex.axis = 1.4, 
    cex.lab = 1.4)

topT <- as.data.frame(res)

# Adjusted P values (FDR Q values)
with(topT, plot(log2FoldChange, -log10(padj), pch = 20, 
    main = "Volcano plot", cex = 1, xlab = bquote(~Log[2] ~ 
        fold ~ change), ylab = bquote(~-log[10] ~ p ~ 
        adj)))
with(subset(topT, padj < 0.05 & abs(log2FoldChange) > 
    2), points(log2FoldChange, -log10(padj), pch = 20, 
    col = "red", cex = 1))

# Add lines for absolute FC>2 and P-value cut-off
# at FDR Q<0.05
abline(v = 0, col = "black", lty = 3, lwd = 1)
abline(v = -2, col = "black", lty = 4, lwd = 2)
abline(v = 2, col = "black", lty = 4, lwd = 2)
abline(h = -log10(max(topT$pvalue[topT$padj < 0.05], 
    na.rm = TRUE)), col = "black", lty = 4, lwd = 2)

GO Terms

# https://shiring.github.io/rna-seq/deseq2/teaching/2016/09/29/DESeq2-course
# get entrezid
entrez_list <- getBM(filters = "hgnc_symbol", attributes = c("hgnc_symbol", 
    "entrezgene_id"), values = rownames(mat_df), mart = mart)
mat_df$hgnc_symbol <- rownames(mat_df)
mat_df <- inner_join(mat_df, entrez_list, by = "hgnc_symbol")

This is the GO profile using a biological processes subontology

OrgDb <- org.Hs.eg.db  # can also be other organisms
gene <- na.omit(mat_df$entrezgene)

# Group GO
ggo <- clusterProfiler::groupGO(gene = as.character(gene), 
    OrgDb = OrgDb, ont = "BP", level = 3, readable = TRUE)
## Loading required package: DOSE
## DOSE v3.8.2  For help: https://guangchuangyu.github.io/DOSE
## 
## If you use DOSE in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
barplot(ggo, drop = TRUE, showCategory = 12)

GO over-representation test

Using a p value and q value of .05, this returns the enrichment GO categories. Shown are the top 6

# GO over-representation test
ego <- clusterProfiler::enrichGO(gene = gene, OrgDb = OrgDb, 
    ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05, 
    qvalueCutoff = 0.05, readable = TRUE)

DT::datatable(head(summary(ego)[, -8]), rownames = FALSE)
## Warning in summary(ego): summary method to convert the object to data.frame
## is deprecated, please use as.data.frame instead.
write.csv(ego, file = "pNFvMPNST_GO_overRep.csv", row.names = FALSE)

###KEGG pathways

## KEGG
kk <- clusterProfiler::enrichKEGG(gene = gene, organism = "hsa", 
    pAdjustMethod = "BH", pvalueCutoff = 0.05, qvalueCutoff = 0.05)
DT::datatable(head(summary(kk)[, -8]), rownames = FALSE)
write.csv(kk, file = "pNFvMPNST_KEGG.csv", row.names = FALSE)